import pandas as pd
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')


brightcove_1 = pd.read_csv('brightcove sizes cleaned.csv', encoding='iso-8859-1')


brightcove = brightcove_1.copy()


brightcove.head()


df = brightcove[['duration','tags']].copy()
df['tags'] = df['tags'].apply(literal_eval) #convert to list type
df = df.explode('tags')


df['tags'] = df.tags.str.replace(r'(^.*ball.*$)', '8 baller')
df['tags'] = df.tags.str.replace(r'(^.*1450.*$)', '1450 club')
df['tags'] = df.tags.str.replace(r'(^.*a ndrew keene.*$)', 'andrew keene')
df['tags'] = df.tags.str.replace(r'(^.*ak.*$)', 'akshow')
df['tags'] = df.tags.str.replace(r'(^.*andrew.*$)', 'andrew keene')
df['tags'] = df.tags.str.replace(r'(^.*angels.*$)', 'angels and entrepreneurs')
df['tags'] = df.tags.str.replace(r'(^.*angels and entrepreneurs*$)', 'angels and entreprenuers')
df['tags'] = df.tags.str.replace(r'(^.*baldiwn.*$)', 'baldwin')
df['tags'] = df.tags.str.replace(r'(^.*bobby eight.*$)', 'bobby-eight')
df['tags'] = df.tags.str.replace(r'(^.*brandon-clip.*$)', 'brandon')
df['tags'] = df.tags.str.replace(r'(^.*brian.*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*brian-clip*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*brian-king*$)', 'brian king')
df['tags'] = df.tags.str.replace(r'(^.*chris.*$)', 'chris johnson')
df['tags'] = df.tags.str.replace(r'(^.*cj.*$)', 'cj')
df['tags'] = df.tags.str.replace(r'(^.*crupto.*$)', 'crypto')
df['tags'] = df.tags.str.replace(r'(^.*eric.*$)', 'eric hoff')
df['tags'] = df.tags.str.replace(r'(^.*fast.*$)', 'fast fortune')
df['tags'] = df.tags.str.replace(r'(^.*foundations .*$)', 'foundations trading')
df['tags'] = df.tags.str.replace(r'(^.*garrett.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*gbald.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*gbshow.*$)', 'garrett baldwin')
df['tags'] = df.tags.str.replace(r'(^.*hot.*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*hot .*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*hotclip.*$)', 'hot clip')
df['tags'] = df.tags.str.replace(r'(^.*jared.*$)', 'jared levy')
df['tags'] = df.tags.str.replace(r'(^.*kenny.*$)', 'kenny glick')
df['tags'] = df.tags.str.replace(r'(^.*live.*$)', 'live event')
df['tags'] = df.tags.str.replace(r'(^.*liz.*$)', 'liz-clip')
df['tags'] = df.tags.str.replace(r'(^.*locked.*$)', 'locked & loaded')
df['tags'] = df.tags.str.replace(r'(^.*main.*$)', 'main')
df['tags'] = df.tags.str.replace(r'(^.*mark.*$)', 'mark sebastian')
df['tags'] = df.tags.str.replace(r'(^.*mml.*$)', 'mml')
df['tags'] = df.tags.str.replace(r'(^.*monday.*$)', 'monday')
df['tags'] = df.tags.str.replace(r'(^.*morning.*$)', 'morning')
df['tags'] = df.tags.str.replace(r'(^.*ms.*$)', 'ms')
df['tags'] = df.tags.str.replace(r'(^.*nick.*$)', 'nick black')
df['tags'] = df.tags.str.replace(r'(^.*night.*$)', 'night trader')
df['tags'] = df.tags.str.replace(r'(^.*options.*$)', 'options')
df['tags'] = df.tags.str.replace(r'(^.*profit.*$)', 'profit')
df['tags'] = df.tags.str.replace(r'(^.*project 303.*$)', 'project 303')
df['tags'] = df.tags.str.replace(r'(^.*pump.*$)', 'pump')
df['tags'] = df.tags.str.replace(r'(^.*reg.*$)', 'reg')
df['tags'] = df.tags.str.replace(r'(^.*solid.*$)', 'solid jello')
df['tags'] = df.tags.str.replace(r'(^.*stinger.*$)', 'stinger')
df['tags'] = df.tags.str.replace(r'(^.*tim.*$)', 'tim melvin')
df['tags'] = df.tags.str.replace(r'(^.*tom.*$)', 'tom gentile')
df['tags'] = df.tags.str.replace(r'(^.*trade.*$)', 'trade')
df['tags'] = df.tags.str.replace(r'(^.*unicorn.*$)', 'unicorn hunters')
df['tags'] = df.tags.str.replace(r'(^.*volume.*$)', 'volume')
df['tags'] = df.tags.str.replace(r'(^.*vwap.*$)', 'vwap')
df['tags'] = df.tags.str.replace(r'(^.*warlock.*$)', 'warlock')
df['tags'] = df.tags.str.replace(r'(^.*warlock.*$)', 'warlock')
df['tags'] = df.tags.str.replace(r'(^.*wade.*$)', 'wade')
df['tags'] = df.tags.str.replace(r'(^.*world.*$)', 'world')


df = df.groupby('tags',as_index =False).mean()
df.columns = ['Tags', 'Average Duration']


df.head(20)

	Unnamed: 0	video_id	name	created_at	master_size	hls_renditions_size	mp4_renditions_size	audio_renditions_size	duration	created_by	folder_id	images	published_at	updated_at	updated_by	tags	created_by_id	updated_by_id	created_by_email	updated_by_email
0	0	6264776212001	CLF	2021-07-22	0.099985	0.293185	0.036815	0.009675	2.615817	{"type":"user","id":"76076235132","email":"EHo...	61f17b499d7e3f03e9fbc2c0	{"thumbnail":{"src":"https://cf-images.us-east...	2022-05-02 18:08:23.995000+00:00	2022-05-02 18:37:42.165000+00:00	{"type":"internal"}	['']	7.607624e+10	NaN	EHoff@cw.moneymappress.com	NaN
1	1	6268160131001	PLT Takes Out the Technical Trinity	2021-08-16	0.009995	0.206397	0.027653	0.006493	1.762133	{"type":"internal"}	626c69dcb0299755821f4eb3	{"poster":{"src":"https://cf-images.us-east-1....	2022-05-02 17:32:19.086000+00:00	2022-05-02 17:51:13.050000+00:00	{"type":"internal"}	['mmlive_replays']	NaN	NaN	NaN	NaN
2	2	6284596661001	Garrett Baldwin - REPLAY - 11-30-2021	2021-12-01	0.998009	3.820449	0.494816	0.114247	30.940800	{"type":"user","id":"76076235128","email":"nyt...	61683bcb13798a65ef63a329	{"thumbnail":{"src":"https://cf-images.us-east...	2022-05-02 17:15:37.172000+00:00	2022-05-02 18:22:47.635000+00:00	{"type":"user","id":"76076235127","email":"rsa...	['garrett baldwin']	7.607624e+10	7.607624e+10	nyteowl9@gmail.com	rsandusky@cw.moneymappress.com
3	3	6305270399001	Shah Gilani REPLAY - 27 April 2022	2022-04-27	0.997150	4.092209	0.529739	0.122784	33.143117	{"type":"user","id":"76076235127","email":"rsa...	611160bdbda6bb0af9aa53a3	{"poster":{"src":"https://cf-images.us-east-1....	2022-05-02 16:31:57.794000+00:00	2022-05-02 17:04:57.053000+00:00	{"type":"internal"}	['shah gilani']	7.607624e+10	NaN	rsandusky@cw.moneymappress.com	NaN
4	4	6293084125001	Bobby Eight - 8Ball - 01212022 MML	2022-01-22	0.996883	3.098494	0.405372	0.090642	25.436800	{"type":"user","id":"76076247291","email":"ami...	625eee609de8257ffa4d7472	{"poster":{"src":"https://cf-images.us-east-1....	2022-05-02 15:31:33.255000+00:00	2022-05-02 16:03:01.164000+00:00	{"type":"user","id":"76076235127","email":"rsa...	['8baller', 'main room', '8ball']	7.607625e+10	7.607624e+10	amine.cherouati94@gmail.com	rsandusky@cw.moneymappress.com

	Tags	Average Duration
0		16.573326
1	04/28/2021	5.942339
2	04/29/2021	12.041250
3	101s	2.688291
4	10mm_media	19.472717
5	1450 club	64.072744
6	303	31.123025
7	3m	2.627200
8	8 baller	33.659754
9	a & e	27.004800
10	akshow	42.314451
11	ama	3.631283
12	andrew keene	30.725688
13	angels and entreprenuers	28.556359
14	baldwin	31.002667
15	biggest	55.852258
16	bitcoin	49.644926
17	black	0.694050
18	bobby-eight	59.998729
19	boom	3.305600

Data Cleaning¶

Task: Merge Redundant Values¶